# Adapted from https://github.com/gflohr/Lingua-Poly/blob/master/src/app/core/validators/urlValidator.ts
# originally licensed under WTFPL v2

import ipaddress
import re

from pydantic import HttpUrl


class RestrictedHttpUrl(HttpUrl):
    """HTTP(S) URL type that only accepts public-looking hosts.

    On top of the base ``HttpUrl`` validation this type rejects URLs that

    * use a port other than 80 or 443,
    * carry a user name or password,
    * point to a loopback, private, link-local, multicast or reserved IP,
    * use a host that is not a fully-qualified domain name,
    * use a reserved or special-purpose top-level domain, or one of the
      ``example.com`` / ``example.net`` / ``example.org`` domains,
    * contain labels or characters that are not valid in a host name.

    Every rejection is reported as a ``ValueError``.
    """

    @classmethod
    def __get_validators__(cls):
        """Yield the base validator followed by the port and host checks."""
        yield cls.validate  # base validator
        yield cls.check_port
        yield cls.check_hostname

    @classmethod
    def check_port(cls, url: HttpUrl) -> HttpUrl:
        """Return *url* unchanged if its port is unset, '80' or '443'.

        Raises:
            ValueError: if any other port is used.
        """
        if url.port not in (None, '80', '443'):
            raise ValueError("URLs with non standard ports are not allowed")
        return url

    @classmethod
    def check_hostname(cls, url: HttpUrl) -> HttpUrl:
        """Return *url* unchanged if its host passes all restrictions.

        Raises:
            ValueError: if the URL carries credentials, or if its host is a
                non-public IP address, is not a fully-qualified domain name,
                uses a reserved top-level domain, or is syntactically
                invalid.
        """
        # Do not allow username or password information.
        if url.user or url.password:
            raise ValueError("URLs with authentication are not allowed")

        host = url.host

        # Split the hostname into labels.
        labels = host.split('.')

        # Discard an empty root label.
        if labels[-1] == '':
            labels.pop()

        # Otherwise empty labels are illegal.  This also rejects the illegal
        # hostname "." like in "https://.".
        if any(not label for label in labels):
            raise ValueError("Empty labels in url host name are not allowed")

        # Numerical IP address?  Only the parsing lives inside the ``try``:
        # the policy error raised below is a ValueError as well and must not
        # be swallowed by the handler for "this is not an IP address".
        # NOTE(review): a host that still carries URL brackets (e.g.
        # "[::1]") does not parse here and ends up rejected by the FQDN
        # check below -- confirm that this is the intended policy.
        try:
            ip = ipaddress.ip_address(host)
        except ValueError:
            ip = None

        if ip is not None:
            if (ip.is_loopback or ip.is_private or ip.is_link_local or
                    ip.is_multicast or ip.is_reserved):
                raise ValueError("Only public IPs and hosts are allowed")
            return url

        # Only fully-qualified domain names?
        if len(labels) < 2:
            raise ValueError("Only FQDN hosts are allowed")

        # But what about hostnames like 'co.uk' or 'b.br'?

        # The top-level domain name must not contain a hyphen or digit unless it
        # is an IDN.
        tld = labels[-1]
        if not tld.startswith('xn--') and re.search(r'[-0-9]', tld):
            raise ValueError("Invalid URL")

        # RFC 2606, RFC6762, RFC7686, and special purpose domains
        # (.arpa, .int) or recommended for private use (.home, .corp).
        reserved_tlds = {
            'example',
            'test',
            'localhost',
            'invalid',
            'local',
            'onion',
            'home',
            'corp',
            'arpa',
            'int',
        }
        # len(labels) >= 2 is guaranteed by the FQDN check above, so
        # labels[-2] always exists here.
        is_example_domain = (labels[-2] == 'example'
                             and tld in {'com', 'net', 'org'})
        if tld in reserved_tlds or is_example_domain:
            raise ValueError("URL not allowed")

        # Some people say that a top-level domain must be at least two
        # characters long. But there's no evidence for that.

        # Leading hyphens or digits, and trailing hyphens are not allowed.
        # Labels are known to be non-empty at this point.
        for label in labels:
            if re.search(r'^[-0-9]', label) or label.endswith('-'):
                raise ValueError("Invalid URL")

        # Unicode.  We allow all characters except the forbidden ones in the
        # ASCII range: there, only '-', '.', digits and lowercase letters
        # are accepted (uppercase letters and '_' are rejected).
        if re.search(r'[\x00-\x2c\x2f\x3a-\x60\x7b-\x7f]', host):
            raise ValueError("Invalid URL")

        return url
